# options
options(stringsAsFactors = F)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
knitr::opts_knit$set(progress = FALSE)
We feed log transformed values, using log1p()
base R function, for accessibility at 15,000 regions in the genome with the most variance in chromatin accessibility, all transcripts and all proteins with abundance measurements into MOFA+ for model generation.
# create the MOFA object
all_df_shared_MOFAobject <- create_mofa(all_df_shared)
# Load in model results generated using _src/MOFA_train_model_CPU.r
all_df_shared_model <- load_model(here("../pQTL_website/_data/all_df_shared_30factors_2022-04-11.hdf5"), remove_inactive_factors = F)
# Warning message:
# In .quality_control(object, verbose = verbose) :
# Factor(s) 3 are strongly correlated with the total number of expressed features for at least one of your omics. Such factors appear when there are differences in the total 'levels' between your samples, *sometimes* because of poor normalisation in the preprocessing steps.
# Removing Factor 3 that shows high correlation to the total # of expressed features.
all_df_shared_model <- subset_factors( all_df_shared_model, factors = c(1:2, 4:30))
# Removing Factors that don't explain at least 1% of variation in a data set.
all_df_shared_factor_var <- all_df_shared_model@cache$variance_explained$r2_per_factor %>%
as.data.frame() %>%
rownames_to_column() %>%
filter(single_group.Chromatin > 1 |
single_group.Protein > 1 |
single_group.Transcript > 1) %>%
column_to_rownames()
all_df_shared_model_filtered <- subset_factors(all_df_shared_model,
factors = as.numeric(gsub("Factor","",rownames(all_df_shared_factor_var))))
# updating the sexes in the metadata to use Female/Male instead of 0s and 1s.
merged_metadata <- merged.covar2 %>%
as.data.frame() %>%
rownames_to_column() %>%
select(-lifr) %>%
rename("sample"="rowname")
shared_samples_metadata <- all_df_shared_model_filtered@samples_metadata %>%
left_join(.,merged_metadata) %>%
mutate(sex = ifelse(sex ==0,"Female", "Male"))
samples_metadata(all_df_shared_model_filtered) <- shared_samples_metadata
# Get % variance explained for each factor
all_df_shared_var_explained <- (calculate_variance_explained(all_df_shared_model_filtered))
# Get Factor weights
all_df_shared_factors <- get_factors(all_df_shared_model_filtered,
factors = "all",
as.data.frame = T
)
# Add sample details and convert to matrix with Factors in rows and samples in columns.
all_df_shared_factors_mat <- all_df_shared_factors %>%
pivot_wider(id_cols = "sample", names_from = "factor", values_from = "value") %>%
left_join(., select(threeway.shared.samples, top_muga, sampleid), by =c("sample"="top_muga")) %>%
column_to_rownames("sample") %>%
select(-sampleid) %>%
as.matrix()
# Get feature weights
all_df_shared_weights <- get_weights(all_df_shared_model_filtered, as.data.frame = TRUE, scale = TRUE)
# Correlate Factors with covariates and get R values
correlate_factors_with_covariates(all_df_shared_model_filtered,
covariates = c("sex","lifr_geno"),
plot="r",
return_data = TRUE) %>%
as_tibble( rownames = "Factor") %>%
pivot_longer( cols = c("sex","lifr_geno"),
names_to = "covariate",
values_to = "pearson_r") -> corr_to_cov_r
# Correlate Factors with covariates and get p-values
correlate_factors_with_covariates(all_df_shared_model_filtered,
covariates = c("sex","lifr_geno"),
plot="log_pval",
return_data = TRUE) %>%
as_tibble( rownames = "Factor") %>%
pivot_longer( cols = c("sex","lifr_geno"),
names_to = "covariate",
values_to = "log_pval") -> corr_to_cov_logpval
# merge R & p-values into a single data frame
corr_to_cov_r %>%
full_join( corr_to_cov_logpval) -> all_df_shared_corr_covar
# get atac-seq peak drivers + run overrepresentation analysis using LOLA
# these are all the ATAC-seq peaks that are fed into MOFA that we will use as the custom background in overrepresentation analysis
background_atac_peaks <- all_df_shared %>%
filter(view == c("Chromatin")) %>%
rename( peak_id = feature) %>%
select( -view,-value, -sample) %>%
distinct() %>%
separate( peak_id, into = c("Chr", "Start","End"), remove = FALSE) %>%
mutate( Chr = gsub("peak","chr",Chr)) %>%
makeGRangesFromDataFrame(.,
keep.extra.columns = F,
seqnames.field = c("Chr"),
start.field = "Start",
end.field = "End")
get_tf_ora <- function( factor_df, factor_num, bg_peaks){
# get top ATAC-seq peak drivers
top_atac_drivers <- factor_df %>%
slice( which( factor_df$value %in%
boxplot.stats( filter(factor_df, factor == factor_num & view =="Chromatin")$value)$out )
)
# convert the top ATAC-seq peak drivers to a compatible object for LOLA
atac_peaks_for_lola <- top_atac_drivers %>%
rename( peak_id = feature) %>%
separate( peak_id, into = c("Chr", "Start","End"), remove = FALSE) %>%
mutate( Chr = gsub("peak","chr",Chr)) %>%
makeGRangesFromDataFrame(.,
keep.extra.columns = F,
seqnames.field = c("Chr"),
start.field = "Start",
end.field = "End")
# run LOLA with custom background
lola_results <- runLOLA(atac_peaks_for_lola,
bg_peaks,
regionDB,
cores=1)
# add q-value for filtering later
lola_results$qValue <- (qvalue( 10^(-lola_results$pValueLog )))$qvalues
return(lola_results)
}
# get_tf_ora( all_df_shared_weights, "Factor1", background_atac_peaks)
# I ran the function above for all factors and saved the output
# below is the filtered results for qvalue <0.05 and cell type as ESCs
load( here("../pQTL_website/_data/","MOFA_Factor_LOLA_results.RData")) # lola_combined
Figure5A_data_var_exp <- all_df_shared_var_explained$r2_per_factor$single_group %>%
as_tibble(rownames = "Factor") %>%
mutate(factor_num = as.numeric(gsub( "Factor" ,"",Factor)) )%>%
arrange((factor_num)) %>%
pivot_longer(c("Chromatin","Protein","Transcript")) %>%
mutate( Factor = factor(Factor, levels = unique(Factor))) %>%
mutate( name = factor( name, levels = c("Chromatin","Transcript","Protein"))) %>%
select( `Data set` = name, Factor, `% Variation Explained` = value)
Figure5A_data_cor_to_covar <- all_df_shared_corr_covar %>%
mutate( Factor = factor(Factor, levels = unique(Factor))) %>%
mutate( covariate = ifelse( covariate =="sex", "Sex", "LIFR genotype")) %>%
mutate( covariate = factor( covariate, levels = c("Sex", "LIFR genotype"))) %>%
select(Factor, covariate, Correlation =pearson_r)
Figure5A_data_tf_ora <- lola_combined %>%
filter( antibody %in% c("Nanog","Sox2","Pou5f1")) %>%
filter( description %in% c("ES cells expressing control shRNA targeting GFP",
"V6.5 (C57BL/6-129) murine ES cells were grown under typical ES conditions on irr",
"N/A")) %>%
filter( !description %in% c("ES cells expressing shRNA targeting Kdm4b",
"ES cells expressing shRNA targeting Kdm4c",
"ES cells cultured in FGF4; heparin and Activin A for 1 day after Oct3/4 deleted",
"ES cells differentiated in vitro for 2 days; Brn2 expression induced using Tet-O",
"ES cells cultured in FGF4; heparin and Activin A for 2 days after Oct3/4 deleted",
"ES cells cultured in FGF4; heparin and Activin A for 3 days after Oct3/4 deleted",
"ES cells differentiated in vitro for 2 days")) %>%
full_join( tibble(
factor = c( paste0("Factor ", seq(1:23))),
factor_num = seq(1:23))
) %>%
group_by(factor, antibody, factor_num) %>%
dplyr::summarize( mean_odds = mean(oddsRatio, na.rm= T)) %>%
ungroup() %>%
arrange(factor_num) %>%
mutate( Factor = factor(factor, levels = unique(factor))) %>%
select(Factor, TF = antibody, `Odds ratio` = mean_odds)
# % variation explained heatmap.
Figure5A_data_var_exp %>%
ggplot()+
aes(
y = Factor,
x = `Data set`,
fill = `% Variation Explained`)+
geom_tile()+
xlab("")+
ylab("")+
theme_pubclean( base_size = 18)+
scale_fill_gradient2( limits = c(0, 10))+
#scale_size_area()+
labs(fill = "% Variation Explained", col = "")+
scale_x_discrete( expand = expansion(mult = 0))+
theme(
legend.position = "top",
legend.title = element_text(size = 18, vjust = 0.9),
legend.text = element_text(angle=45, size =12, vjust = 0.5),
)-> all_df_shared_var_plot
# correlation to covariates.
Figure5A_data_cor_to_covar %>%
filter(covariate == "Sex") %>%
ggplot()+
aes(
x = covariate,
y = Factor,
fill = abs(Correlation)
)+
geom_tile()+
xlab("")+
ylab("")+
theme_pubclean(base_size = 18)+
scale_fill_gradient(low = "white",high = "dark red", limits = c(0,1))+
theme(legend.position = "top",
axis.ticks.y = element_blank(),
legend.text = element_text(angle=45, size =12, vjust = 0.5),
legend.title = element_text(size = 18, vjust = 0.9),
)+
scale_y_discrete( labels = NULL)+
scale_x_discrete( expand = expansion(mult = 0))+
labs( fill = "Correlation") -> all_df_shared_covar_plot_sex
Figure5A_data_cor_to_covar %>%
filter(covariate != "Sex") %>%
ggplot()+
aes(
x = covariate,
y = Factor,
fill = abs(Correlation)
)+
geom_tile()+
xlab("")+
ylab("")+
theme_pubclean(base_size = 18)+
scale_fill_gradient(low = "white",high = "deeppink4", limits = c(0,1))+
theme(legend.position = "top",
axis.ticks.y = element_blank(),
legend.text = element_text(angle=45, size =12, vjust = 0.5),
legend.title = element_text(size =12)
)+
scale_y_discrete( labels = NULL)+
scale_x_discrete( expand = expansion(mult = 0))+
labs( fill = "") -> all_df_shared_covar_plot_lifr
# Odds ratio for TF binding overrepresentation for NANOG, OCT4, SOX2 in ATACseq peak drivers
Figure5A_data_tf_ora %>%
mutate( mean_odds_capped = ifelse( `Odds ratio` >10, 10, `Odds ratio`)) %>%
ggplot()+
aes(
x = TF,
y = Factor,
fill = mean_odds_capped
)+
geom_tile()+
xlab("")+
ylab("")+
theme_pubclean(base_size = 18)+
scale_fill_gradient(low = "light gray",high = "#ffa08aff", limits = c(0,10)) +
theme(legend.position = "top",
legend.text = element_text(angle=45, size =12, vjust = 0.5),
legend.title = element_text(size = 18, vjust = 0.9),
axis.text.x = element_text(angle = 0),
axis.ticks.y = element_blank(),
)+
scale_y_discrete( labels = NULL)+
scale_x_discrete( expand = expansion(mult = 0))+
labs( fill = "Odds ratio") -> lola_sum_figure
figure5a <- ggarrange( all_df_shared_var_plot,
all_df_shared_covar_plot_sex,
all_df_shared_covar_plot_lifr,
lola_sum_figure,
widths = c(0.45,0.15,0.15,0.2), nrow = 1,
labels = "A",
font.label = list( size = 28))
figure5a
MOFA yielded 23 latent factors that capture variation in one or more layers of genomic data. For each factor, percent of variation explained in chromatin accessibility, transcript abundance, and protein abundance is displayed as a heatmap, as is the correlation of each factor to experimental covariates including sex and genotype at the Lifr locus. Heatmap on the right indicates overrepresentation of pluripotency regulator binding sites (NANOG, OCT4 (Pou5f1) and SOX2) among the top chromatin drivers of each factor.
Figure5A_data_var_exp %>%
mutate_if( is.numeric, round, 2) %>%
create_dt()
Figure5A_data_cor_to_covar %>%
mutate_if( is.numeric, round, 2) %>%
create_dt()
Figure5A_data_tf_ora %>%
mutate_if( is.numeric, round, 2) %>%
create_dt()
MOFA factor values (columns) for each of the 163 samples (rows) are listed in the first tab. Weights for each of the 23 MOFA factors (columns) are listed for every protein, transcript, and chromatin peak (rows), with tabs corresponding to the three data types.Feature annotations include Ensembl protein identifier and gene symbol for proteins, Ensembl gene identifier and gene symbol for transcripts, and chromatin peak identifier and proximal gene annotation from R/ChIPseeker for ATAC-seq peaks.